# library
library(ape) # manual: https://cran.r-project.org/web/packages/ape/ape.pdf
library(nlme) # manual: https://cran.r-project.org/web/packages/nlme/nlme.pdf
library(MASS)
library(car)
library(devtools)
library(tidyverse)
library(broom)
library(ggplot2)
library(ggpubr)

Correlations

piS without log transformation. Kendall is used

piS with log transformation - Linearity improves. Kendall is used

The observed trend above indicates a more efficient action of natural selection on species with higher Ne. The effect seems stronger between log10(piS) and omegaNA. To infer the significance of these correlations we need to take into account the phylogenetic signal.

# load data with omegaA and omegaNA plus per species factors for the two datasets

spp_summ_gene <- read.table("./Data/1_Dataset/20spp_CatGene_rates+piS+SppRho.txt",sep="|", header=TRUE)
spp_summ_gene$Category_gene2 <- factor(spp_summ_gene$Data, levels = c("Non-secreted", "Secreted", "Effectors")) # change order to ease comparison in the model
spp_summ_gene$piS_log <- log10(spp_summ_gene$piS)

remove_spp <- c("V. dahliae", "R. commune", "P. teres f. teres","M. oryzae rice", "F. graminearum", "C. parasitica")


spp_summ_gene <- spp_summ_gene[!spp_summ_gene$Species %in% remove_spp, ]


# Load tree
spp_tree_raw <- read.tree(file = "./Data/0_Taxonomy/14spp_phyliptree.phy", text = NULL, tree.names = NULL, skip = 0, comment.char = "", keep.multi = FALSE)

spp_tree_thinned_omegaA_raw <- read.tree(file = "./Data/0_Taxonomy/6spp_phyliptreeNamed.phy", text = NULL, tree.names = NULL, skip = 0, comment.char = "", keep.multi = FALSE)

spp_tree_thinned_omegaNA_raw <- read.tree(file = "./Data/0_Taxonomy/7_spp_omegaNA_rec.phy", text = NULL, tree.names = NULL, skip = 0, comment.char = "", keep.multi = FALSE)

# match label
rename_tree <- function(tree_object) {
  tree_object$tip.label <- gsub("'","",tree_object$tip.label)
  tree_object$tip.label <- gsub("Parastagonospora nodorum","P. nodorum",tree_object$tip.label)
  tree_object$tip.label <- gsub("Venturia inaequalis","V. inaequalis",tree_object$tip.label)
  tree_object$tip.label <- gsub("Penicillium biforme","P. biforme",tree_object$tip.label)
  tree_object$tip.label <- gsub("Ophiostoma novo-ulmi subsp. novo-ulmi","O. novo-ulmi subsp. novo-ulmi",tree_object$tip.label)
  tree_object$tip.label <- gsub("Sphaerulina musiva","S. musiva",tree_object$tip.label)
  tree_object$tip.label <- gsub("Zymoseptoria tritici","Z. tritici",tree_object$tip.label)
  tree_object$tip.label <- gsub("Ophiostoma novo-ulmi subsp. americana","O. novo-ulmi subsp. americana",tree_object$tip.label)
  tree_object$tip.label <- gsub("Ophiostoma ulmi","O. ulmi",tree_object$tip.label)
  tree_object$tip.label <- gsub("Cercospora beticola","C. beticola",tree_object$tip.label)
  tree_object$tip.label <- gsub("Botrytis cinerea","B. cinera",tree_object$tip.label)
  tree_object$tip.label <- gsub("Sclerotinia sclerotiorum","S. sclerotiorum",tree_object$tip.label)
  tree_object$tip.label <- gsub("Pyricularia oryzae 4091-5-8","M. oryzae Triticum",tree_object$tip.label)
  tree_object$tip.label <- gsub("Neurospora discreta","N. discreta",tree_object$tip.label)
  tree_object$tip.label <- gsub("Aspergillus flavus","A. flavus",tree_object$tip.label)
  tree_object$tip.label <- gsub("Fusarium graminearum","F. graminearum",tree_object$tip.label)
  tree_object$tip.label <- gsub("Cryphonectria parasitica","C. parasitica",tree_object$tip.label)
  tree_object$tip.label <- gsub("Pyricularia oryzae 70-15","M. oryzae rice",tree_object$tip.label)
  tree_object$tip.label <- gsub("Pyrenophora teres f. teres","P. teres f. teres",tree_object$tip.label)
  tree_object$tip.label <- gsub("Rhynchosporium commune","R. commune",tree_object$tip.label)
  tree_object$tip.label <- gsub("Verticillium dahliae","V. dahliae",tree_object$tip.label)
  return(tree_object)
}

spp_tree <- rename_tree(spp_tree_raw)

#spp_summ_gene$Species %in% spp_tree$tip.label
#spp_summ_rec$Species %in% spp_tree$tip.label # add into the model..
# Function for convenience:
normtest <- function(model) {
  return(shapiro.test(resid(model)))
}
indeptest <- function(model) {
  return(Box.test(resid(model)[order(fitted(model))], type = "Ljung-Box"))
}

Tree for phylogenetic correction

# check tree. This tree is simply taken from NCBI taxonomy
plot(spp_tree)


Dataset: Protein-coding genes predicted as Non-secreted, Secreted, Effectors

Visual check for omegaA and omegaNA - Gene category

Looking at the plots above it’s possible to see an effect of gene function, mostly effectors, on omegaA and omegaNA for some species. This effect is however not widespread

Variable: omegaA

Model 1: gls using default restricted maximum likelihood

m_omegaA_gene <- gls(omegaA ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaA_gene))

plot(resid(m_omegaA_gene)~fitted(m_omegaA_gene))

normtest(m_omegaA_gene)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model)
## W = 0.99339, p-value = 0.997
indeptest(m_omegaA_gene)
## 
##  Box-Ljung test
## 
## data:  resid(model)[order(fitted(model))]
## X-squared = 1.8555, df = 1, p-value = 0.1731
summary(m_omegaA_gene)
## Generalized least squares fit by REML
##   Model: omegaA ~ Category_gene2 + piS_log 
##   Data: spp_summ_gene 
##         AIC       BIC   logLik
##   -91.20471 -81.37919 51.60235
## 
## Correlation Structure: corGrafen
##  Formula: ~Species 
##  Parameter estimate(s):
##       rho 
## 0.1364945 
## 
## Coefficients:
##                              Value  Std.Error   t-value p-value
## (Intercept)             0.07582192 0.05333494 1.4216182  0.1633
## Category_gene2Secreted  0.02244116 0.01439549 1.5589023  0.1273
## Category_gene2Effectors 0.03641598 0.01439549 2.5296800  0.0157
## piS_log                 0.00025630 0.02631897 0.0097381  0.9923
## 
##  Correlation: 
##                         (Intr) Ctg_2S Ctg_2E
## Category_gene2Secreted   0.134              
## Category_gene2Effectors -0.418  0.053       
## piS_log                  0.871  0.103 -0.103
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.02274749 -0.48854713  0.06739147  0.73310993  2.26165428 
## 
## Residual standard error: 0.05526628 
## Degrees of freedom: 42 total; 38 residual
# calculate the VIF for each predictor variable in the model
vif(m_omegaA_gene)
##                    GVIF Df GVIF^(1/(2*Df))
## Category_gene2 1.022961  2        1.005691
## piS_log        1.022961  1        1.011415
# create vector of VIF values
vif_values <- vif(m_omegaA_gene)[,1]

# create horizontal bar chart to display each VIF value
barplot(vif_values, main = "GVIF", horiz = TRUE, col = "steelblue", xlim = c(0,6), cex.axis=1, cex.names=0.5) + abline(v = 5, lwd = 3, lty = 2)

## numeric(0)

Model 2: gls using default restricted maximum likelihood and bcnPower transformation to improve normality

summary(p1 <- powerTransform(omegaA ~ Category_gene2 + piS_log, spp_summ_gene, family = "bcnPower"))
spp_summ_gene$omegaA_trans <- bcnPower(spp_summ_gene$omegaA, lambda = p1$lambda, gamma = p1$gamma)
m_omegaA_gene.trans <- gls(omegaA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species)) 
hist(resid(m_omegaA_gene.trans))

plot(resid(m_omegaA_gene.trans)~fitted(m_omegaA_gene.trans))

normtest(m_omegaA_gene.trans)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model)
## W = 0.98956, p-value = 0.9631
indeptest(m_omegaA_gene.trans)
## 
##  Box-Ljung test
## 
## data:  resid(model)[order(fitted(model))]
## X-squared = 1.4113, df = 1, p-value = 0.2348
m_omegaA_gene.trans <- gls(omegaA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species)) 
summary(m_omegaA_gene.trans)
## Generalized least squares fit by REML
##   Model: omegaA_trans ~ Category_gene2 + piS_log 
##   Data: spp_summ_gene 
##        AIC      BIC     logLik
##   11.90051 21.72602 0.04974725
## 
## Correlation Structure: corGrafen
##  Formula: ~Species 
##  Parameter estimate(s):
##       rho 
## 0.1081643 
## 
## Coefficients:
##                              Value  Std.Error   t-value p-value
## (Intercept)             -1.8046415 0.20364341 -8.861772  0.0000
## Category_gene2Secreted   0.0875076 0.05634282  1.553129  0.1287
## Category_gene2Effectors  0.1333529 0.05634282  2.366812  0.0231
## piS_log                  0.0145549 0.10258236  0.141885  0.8879
## 
##  Correlation: 
##                         (Intr) Ctg_2S Ctg_2E
## Category_gene2Secreted   0.119              
## Category_gene2Effectors -0.406  0.035       
## piS_log                  0.875  0.088 -0.088
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.11105089 -0.48653192  0.03925545  0.77322652  1.99152341 
## 
## Residual standard error: 0.2145121 
## Degrees of freedom: 42 total; 38 residual

Variable: omegaNA

Model 1: gls using default restricted maximum likelihood

m_omegaNA_gene <- gls(omegaNA ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaNA_gene))

plot(resid(m_omegaNA_gene)~fitted(m_omegaNA_gene))

normtest(m_omegaNA_gene)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model)
## W = 0.94553, p-value = 0.04474
indeptest(m_omegaNA_gene)
## 
##  Box-Ljung test
## 
## data:  resid(model)[order(fitted(model))]
## X-squared = 3.2817, df = 1, p-value = 0.07006
summary(m_omegaNA_gene)
## Generalized least squares fit by REML
##   Model: omegaNA ~ Category_gene2 + piS_log 
##   Data: spp_summ_gene 
##        AIC       BIC  logLik
##   -77.0674 -67.24188 44.5337
## 
## Correlation Structure: corGrafen
##  Formula: ~Species 
##  Parameter estimate(s):
##       rho 
## 0.1890623 
## 
## Coefficients:
##                              Value  Std.Error   t-value p-value
## (Intercept)             0.14117219 0.06581329 2.1450406  0.0384
## Category_gene2Secreted  0.00896762 0.01705849 0.5256984  0.6022
## Category_gene2Effectors 0.03996095 0.01705849 2.3425851  0.0245
## piS_log                 0.02109889 0.03146361 0.6705806  0.5065
## 
##  Correlation: 
##                         (Intr) Ctg_2S Ctg_2E
## Category_gene2Secreted   0.147              
## Category_gene2Effectors -0.429  0.089       
## piS_log                  0.866  0.124 -0.124
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.0288494 -0.9425169 -0.6620065  0.3798331  2.5824111 
## 
## Residual standard error: 0.0665931 
## Degrees of freedom: 42 total; 38 residual
# calculate the VIF for each predictor variable in the model
vif(m_omegaNA_gene)
##                    GVIF Df GVIF^(1/(2*Df))
## Category_gene2 1.034657  2        1.008554
## piS_log        1.034657  1        1.017181
# create vector of VIF values
vif_values <- vif(m_omegaNA_gene)[,1]

# create horizontal bar chart to display each VIF value
barplot(vif_values, main = "GVIF", horiz = TRUE, col = "steelblue", xlim = c(0,6), cex.axis=1, cex.names=0.5) + abline(v = 5, lwd = 3, lty = 2)

## numeric(0)

Model 2: gls using default restricted maximum likelihood and bcnPower transformation to improve normality

summary(p1 <- powerTransform(omegaNA ~ Category_gene2 + piS_log, spp_summ_gene, family = "bcnPower"))
spp_summ_gene$omegaNA_trans <- bcnPower(spp_summ_gene$omegaNA, lambda = p1$lambda, gamma = p1$gamma)
m_omegaNA_gene.trans <- gls(omegaNA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species)) 
hist(resid(m_omegaNA_gene.trans))

plot(resid(m_omegaNA_gene.trans)~fitted(m_omegaNA_gene.trans))

normtest(m_omegaNA_gene.trans)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model)
## W = 0.96677, p-value = 0.2563
indeptest(m_omegaNA_gene.trans)
## 
##  Box-Ljung test
## 
## data:  resid(model)[order(fitted(model))]
## X-squared = 5.0832, df = 1, p-value = 0.02416
m_omegaNA_gene.trans <- gls(omegaNA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species)) 
summary(m_omegaNA_gene.trans)
## Generalized least squares fit by REML
##   Model: omegaNA_trans ~ Category_gene2 + piS_log 
##   Data: spp_summ_gene 
##        AIC     BIC    logLik
##   119.9635 129.789 -53.98176
## 
## Correlation Structure: corGrafen
##  Formula: ~Species 
##  Parameter estimate(s):
##      rho 
## 0.223764 
## 
## Coefficients:
##                              Value Std.Error   t-value p-value
## (Intercept)             -2.9777675 0.8893300 -3.348327  0.0018
## Category_gene2Secreted   0.1396496 0.2254763  0.619354  0.5394
## Category_gene2Effectors  0.5616736 0.2254763  2.491054  0.0172
## piS_log                  0.3881061 0.4181003  0.928261  0.3591
## 
##  Correlation: 
##                         (Intr) Ctg_2S Ctg_2E
## Category_gene2Secreted   0.149              
## Category_gene2Effectors -0.431  0.113       
## piS_log                  0.864  0.132 -0.132
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.8403201 -1.0707549 -0.7223019  0.6516418  1.9886622 
## 
## Residual standard error: 0.8899668 
## Degrees of freedom: 42 total; 38 residual

Check for differences between Secreted and Effectors

Variable: omegaA

Model 2: gls using default restricted maximum likelihood and bcnPower transformation to improve normality

spp_summ_gene$Category_gene2 <- factor(spp_summ_gene$Data, levels = c("Secreted", "Non-secreted", "Effectors")) # change order to ease comparison in the model

summary(p1 <- powerTransform(omegaA ~ Category_gene2 + piS_log, spp_summ_gene, family = "bcnPower"))
spp_summ_gene$omegaA_trans <- bcnPower(spp_summ_gene$omegaA, lambda = p1$lambda, gamma = p1$gamma)
m_omegaA_gene.trans <- gls(omegaA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species)) 
hist(resid(m_omegaA_gene.trans))

plot(resid(m_omegaA_gene.trans)~fitted(m_omegaA_gene.trans))

normtest(m_omegaA_gene.trans)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model)
## W = 0.98956, p-value = 0.9631
indeptest(m_omegaA_gene.trans)
## 
##  Box-Ljung test
## 
## data:  resid(model)[order(fitted(model))]
## X-squared = 1.4113, df = 1, p-value = 0.2348
m_omegaA_gene.trans <- gls(omegaA_trans ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species)) 
summary(m_omegaA_gene.trans)
## Generalized least squares fit by REML
##   Model: omegaA_trans ~ Category_gene2 + piS_log 
##   Data: spp_summ_gene 
##        AIC      BIC     logLik
##   11.90051 21.72602 0.04974727
## 
## Correlation Structure: corGrafen
##  Formula: ~Species 
##  Parameter estimate(s):
##       rho 
## 0.1081643 
## 
## Coefficients:
##                                 Value  Std.Error   t-value p-value
## (Intercept)                -1.7171339 0.21768180 -7.888275  0.0000
## Category_gene2Non-secreted -0.0875076 0.05634282 -1.553129  0.1287
## Category_gene2Effectors     0.0458452 0.07825866  0.585816  0.5615
## piS_log                     0.0145549 0.10258236  0.141885  0.8879
## 
##  Correlation: 
##                            (Intr) Ct_2N- Ctg_2E
## Category_gene2Non-secreted -0.371              
## Category_gene2Effectors    -0.534  0.694       
## piS_log                     0.842 -0.088 -0.126
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.11105089 -0.48653192  0.03925545  0.77322652  1.99152341 
## 
## Residual standard error: 0.2145121 
## Degrees of freedom: 42 total; 38 residual

Variable: omegaNA

Model 1: gls using default restricted maximum likelihood

m_omegaNA_gene <- gls(omegaNA ~ Category_gene2 + piS_log, spp_summ_gene, correlation=corGrafen(1, spp_tree, form = ~Species))
hist(resid(m_omegaNA_gene))

plot(resid(m_omegaNA_gene)~fitted(m_omegaNA_gene))

normtest(m_omegaNA_gene)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model)
## W = 0.94553, p-value = 0.04474
indeptest(m_omegaNA_gene)
## 
##  Box-Ljung test
## 
## data:  resid(model)[order(fitted(model))]
## X-squared = 3.2817, df = 1, p-value = 0.07006
summary(m_omegaNA_gene)
## Generalized least squares fit by REML
##   Model: omegaNA ~ Category_gene2 + piS_log 
##   Data: spp_summ_gene 
##        AIC       BIC  logLik
##   -77.0674 -67.24188 44.5337
## 
## Correlation Structure: corGrafen
##  Formula: ~Species 
##  Parameter estimate(s):
##       rho 
## 0.1890623 
## 
## Coefficients:
##                                  Value  Std.Error    t-value p-value
## (Intercept)                 0.15013980 0.07037398  2.1334561  0.0394
## Category_gene2Non-secreted -0.00896762 0.01705849 -0.5256984  0.6022
## Category_gene2Effectors     0.03099333 0.02303145  1.3456964  0.1864
## piS_log                     0.02109889 0.03146361  0.6705806  0.5065
## 
##  Correlation: 
##                            (Intr) Ct_2N- Ctg_2E
## Category_gene2Non-secreted -0.380              
## Category_gene2Effectors    -0.563  0.675       
## piS_log                     0.840 -0.124 -0.183
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.0288494 -0.9425169 -0.6620065  0.3798331  2.5824111 
## 
## Residual standard error: 0.0665931 
## Degrees of freedom: 42 total; 38 residual